Could the gloves serve as a pseudo-blank that includes the sample collection step?
library(phyloseq); packageVersion("phyloseq")
## [1] '1.44.0'
# library(Biostrings); packageVersion("Biostrings")
library(ggplot2); packageVersion("ggplot2")
## [1] '3.4.2'
library(dplyr); packageVersion("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## [1] '1.1.2'
library(ggrepel); packageVersion("ggrepel")
## [1] '0.9.3'
require(broman); packageVersion("broman") #optional
## Loading required package: broman
## [1] '0.80'
DADA2_part2 = dir("../..", pattern="DADA2_part2", full.names = T, include.dirs = T)
countsFile = file.path(DADA2_part2, "output", "asv-counts.txt")
counts = read.delim2(countsFile, row.names = 1)
dim(counts)
## [1] 3008 952
counts = counts[rowSums(counts) > 0,]
dim(counts)
## [1] 2915 952
head(row.names(counts))
## [1] "ERR1459793" "ERR1459883" "ERR1460576" "ERR1460577" "ERR1460578"
## [6] "ERR1460579"
head(colnames(counts))
## [1] "ASV.1" "ASV.2" "ASV.3" "ASV.4" "ASV.5" "ASV.6"
metaReduced = dir("../..", pattern="metadata_PRJEB14474", full.names = T, include.dirs = T)
metaFile = file.path(metaReduced, "output", "PRJEB14474_SraRunTable_reduced.txt")
meta = read.delim2(metaFile)
row.names(meta) = meta$ID
message("Read metadata from: ", metaFile)
## Read metadata from: ../../02_review_metadata_PRJEB14474/output/PRJEB14474_SraRunTable_reduced.txt
We will need to describe days as being “pre-opening” or “post-opening”. This information is held is “study_day”. Make a new variable that is a simple category instead of a numeric.
meta$opening = "before opening"
meta$opening[meta$study_day > 0] = "after opening"
We should have meta data for all samples we had data for…and for some samples that have been filtered out. Limit the meta data to only include samples we have in the data.
inData = row.names(counts)
meta = meta[inData, ]
dim(meta)
## [1] 2915 14
assignTax = dir("../..", pattern="assign_ASV_taxonomy", full.names = T, include.dirs = T)
taxaFile = file.path(assignTax, "output/silva/asvTaxaTable_silva-v138.txt")
taxa = read.delim2(taxaFile, row.names = "asvID")
# drop the ASV column
taxaLim = taxa[,grep("ASV", names(taxa), invert = T, value = T)]
taxaMat = as.matrix(taxaLim)
# tmpDir = "../temp"
# suppressWarnings(dir.create(tmpDir, recursive = T))
#
outDir = "../output"
suppressWarnings(dir.create(outDir, recursive = T))
The seed each time that the dirction of the pcoa matters; like times that we add annotations or comments with hard-coded values based on the pcoa.
seed = 22
set.seed(seed)
Lets lay out a set of colors to use for each type / category to be consistent between plots.
# library(broman)
myProjectPalette = c(
# match paper figures
before = "#2E5473",
'before opening'= "#2E5473",
Preopening = "#2E5473",
after = "#BB302F",
'after opening'= "#BB302F",
Postopening = "#BB302F",
# Gloves (with extended categories)
Glove1 = "deepskyblue",
Glove2 = "deepskyblue3",
'Glove from Glove Box' = "deepskyblue1",
Glove = "deepskyblue1",
'blank control' = "black", #white? consider: ("#fefe22") brocolors("crayons")["Laser Lemon"]
# Staff_Surface >> muted blue
'Personal Cell Phone' = "#a2a2d0", # brocolors("crayons")["Blue Bell"]
'Hospital Pager' = "#dbd7d2", # brocolors("crayons")["Timberwolf"]
'Shirt Hem' = "#c5d0e6", # brocolors("crayons")["Periwinkle"]
Shoe = "#b0b7c6", # brocolors("crayons")["Cadet Blue"]
# Station_Surface >> purples
'Countertop' = "#dd4492", # brocolors("crayons")["Cerise"]
'Computer Mouse' = "#c364c5", # brocolors("crayons")["Fuchsia"]
'Station Phone'= "#f664af", # brocolors("crayons")["Magenta"]
'Chair Armrest' = "#de5d83", # brocolors("crayons")["Blush"]
'Corridor Floor' = "purple",
# (Room_Surface | Station_Surface) > Water Faucet Handle >> green
'Hot Tap Water Faucet Handle' = "#71bc78", # brocolors("crayons")["Fern"]
'Cold Tap Water Faucet Handle'= "#87a96b", # brocolors("crayons")["Asparagus"]
'Cold Tap Faucet Handle'="#a8e4a0" , # brocolors("crayons")["Granny Smith Apple"]
# Patient_Skin | Staff_Skin >> organic?
'Inguinal Fold'="#ffa089" , # brocolors("crayons")["Vivid Tangerine"]
Hand="#fae7b5" , # brocolors("crayons")["Banana Mania"]
Nose = "#ffcf48" , # brocolors("crayons")["Sunglow"]
# Nose="#c5e384" , # brocolors("crayons")["Yellow Green"]
Axilla="#f78fa7" , # brocolors("crayons")["Pink Sherbert"]
# Room_Surface >> greenish/blueish
Floor = "#2b6cc4" , # brocolors("crayons")["Denim"]
Bedrail = "#aaf0d1" # brocolors("crayons")["Magic Mint"]
)
# to use this with ggplot2, add a scale_*_manual layer:
# plot1 +
# scale_colour_manual(values = myProjectPalette)
In the legend, Figure 1A is described as:
- Principal coordinate analysis (PCoA) of all floor samples based on weighted UniFrac distance and colored by whether they were taken before or after the hospital’s opening.
# alt syntax:
# doPCoA = function(counts=counts, meta=meta, taxaMat=taxaMat, sampleType="Floor"){
doPCoA = function(sampleType="Floor", colorBy = "opening"){
sampleSet = meta[meta$SAMPLE_TYPE %in% sampleType, "ID"]
countslim = counts[sampleSet,]
ps <- phyloseq(otu_table(counts[sampleSet,], taxa_are_rows=FALSE),
sample_data(meta[sampleSet,]),
tax_table(taxaMat))
ps.prop <- transform_sample_counts(ps, function(otu) otu/sum(otu))
ord <- ordinate(ps.prop, method="PCoA", distance="bray") # TODO: switch to unifrac to match paper
po = plot_ordination(ps.prop, ord, shape="surcat", color=colorBy,
title=paste0("bray PCoA - ", paste0(sampleType, collapse=", "))) +
scale_colour_manual(values = myProjectPalette)
return(list(ord=ord, plotA=po))
}
set.seed(seed)
gloves = doPCoA(sampleType="Glove from Glove Box")
gloves$plotA
ordGlove = gloves$ord$vectors[,1:3] %>% data.frame()
ordGlove$ID = row.names(ordGlove)
ordGlove = merge(ordGlove, meta)
row.names(ordGlove) = ordGlove$ID
p1 = ggplot(data.frame(ordGlove), aes(x=Axis.1, y=Axis.2, col=SAMPLE_TYPE, label=ID)) +
geom_point() +
scale_colour_manual(values = myProjectPalette) +
ggtitle("Unused Glove")
p1
Ax1.cut = 0.08
p1a = p1 +
# abline parameters and Ax1.cut were set based on a plot made after set.seed(22)
geom_abline(slope=1.1, intercept = -0.1, linetype=2, col="gray") +
geom_vline(xintercept = Ax1.cut)
p1a
p1b = p1a + geom_text_repel(show.legend = F)
p1b
## Warning: ggrepel: 226 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
ggsave(file.path(outDir, "UnusedGlove_PCOA.png"))
## Saving 7 x 5 in image
## Warning: ggrepel: 226 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
Using just axis.1 I can capture most of group2. Only two samples (ERR1461892 and ERR1461651) are left out.
group1 = row.names(ordGlove)[ordGlove[,"Axis.1"] < Ax1.cut]
group2 = row.names(ordGlove)[ordGlove[,"Axis.1"] > Ax1.cut]
ordGlove$group = NA
ordGlove[group1,"group"] = "Glove1"
ordGlove[group2,"group"] = "Glove2"
ggplot(data.frame(ordGlove), aes(x=Axis.1, y=Axis.2, col=group, label=ID)) +
geom_point() +
scale_colour_manual(values = myProjectPalette) +
ggtitle("Unused Glove (grouped)")
allglove = table(meta$study_day[meta$SAMPLE_TYPE=="Glove from Glove Box"]) %>% data.frame()
allglovedf = data.frame(study_day=allglove$Var1, total=allglove$Freq)
g2glove = table(meta[group2, "study_day"]) %>% data.frame()
names(g2glove) = c("study_day", "g2")
df = merge(allglovedf, g2glove, all=T)
df$g2[is.na(df$g2)] = 0
df$portionG2 = df$g2 / df$total
table(df$portionG2)
##
## 0 0.375 0.666666666666667 0.8
## 95 1 1 1
## 0.9 1
## 1 14
There are 95 days where all glove samples are in group1, and 13 days where all glove samples are in group2. And 5 days are a mix. The mix days are:
df[df$portionG2 > 0 & df$portionG2 < 1,]
## study_day total g2 portionG2
## 46 66 5 4 0.8000000
## 97 283 6 4 0.6666667
## 107 311 10 9 0.9000000
## 109 318 8 3 0.3750000
How many glove samples are usually taken on the same day?
table(df$total)
##
## 1 2 3 4 5 6 8 9 10
## 39 61 2 1 2 2 2 3 1
This does not distinguish it by room.
all.glove = meta %>%
filter(SAMPLE_TYPE == "Glove from Glove Box") %>%
count(study_day, Location) %>%
mutate(total=n) %>% select(-n)
g2.glove = meta %>%
filter(ID %in% group2) %>%
count(study_day, Location) %>%
mutate(g2=n) %>% select(-n)
m2 = merge(all.glove, g2.glove, all=T)
m2$g2[is.na(m2$g2)] = 0
m2$portionG2 = as.factor(m2$g2 / m2$total)
p1f = ggplot(m2, aes(x=study_day, y=Location, fill=portionG2)) +
geom_tile()
p1f
p1f + xlim(c(0,10))
## Warning: Removed 224 rows containing missing values (`geom_tile()`).
p1f + xlim(c(55,105))
## Warning: Removed 219 rows containing missing values (`geom_tile()`).
p1f + xlim(c(220,270))
## Warning: Removed 223 rows containing missing values (`geom_tile()`).
p1f + xlim(c(275,330))
## Warning: Removed 176 rows containing missing values (`geom_tile()`).
Save images.
pdf(file.path(outDir, "gloveCollectionDays.pdf"))
p1f
p1f + xlim(c(0,10))
## Warning: Removed 224 rows containing missing values (`geom_tile()`).
p1f + xlim(c(55,105))
## Warning: Removed 219 rows containing missing values (`geom_tile()`).
p1f + xlim(c(220,270))
## Warning: Removed 223 rows containing missing values (`geom_tile()`).
p1f + xlim(c(275,330))
## Warning: Removed 176 rows containing missing values (`geom_tile()`).
dev.off()
## quartz_off_screen
## 2
Where would my left-out samles fall?
leftOut = c("ERR1461892", "ERR1461651")
meta[leftOut, c("ID", "study_day", "Location")]
## ID study_day Location
## ERR1461892 ERR1461892 283 R10015
## ERR1461651 ERR1461651 102 R10016
sampleTypes = unique(meta$SAMPLE_TYPE)
meta[group1,"SAMPLE_TYPE"] = "Glove1"
meta[group2,"SAMPLE_TYPE"] = "Glove2"
eachVsGlove = list()
for (type in sampleTypes){
eachVsGlove[[type]] = doPCoA(sampleType = c("Glove1", "Glove2", type), colorBy="SAMPLE_TYPE")
show(eachVsGlove[[type]]$plotA)
}
sessionInfo()
## R version 4.3.0 (2023-04-21)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.4.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] broman_0.80 ggrepel_0.9.3 dplyr_1.1.2 ggplot2_3.4.2
## [5] phyloseq_1.44.0
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.3 xfun_0.39 bslib_0.4.2
## [4] rhdf5_2.44.0 Biobase_2.60.0 lattice_0.21-8
## [7] rhdf5filters_1.12.1 vctrs_0.6.2 tools_4.3.0
## [10] bitops_1.0-7 generics_0.1.3 biomformat_1.28.0
## [13] stats4_4.3.0 parallel_4.3.0 tibble_3.2.1
## [16] fansi_1.0.4 highr_0.10 cluster_2.1.4
## [19] pkgconfig_2.0.3 Matrix_1.5-4 data.table_1.14.8
## [22] S4Vectors_0.38.1 lifecycle_1.0.3 GenomeInfoDbData_1.2.10
## [25] farver_2.1.1 compiler_4.3.0 stringr_1.5.0
## [28] textshaping_0.3.6 Biostrings_2.68.1 munsell_0.5.0
## [31] codetools_0.2-19 permute_0.9-7 GenomeInfoDb_1.36.0
## [34] htmltools_0.5.5 sass_0.4.6 RCurl_1.98-1.12
## [37] yaml_2.3.7 pillar_1.9.0 crayon_1.5.2
## [40] jquerylib_0.1.4 MASS_7.3-58.4 cachem_1.0.8
## [43] vegan_2.6-4 iterators_1.0.14 foreach_1.5.2
## [46] nlme_3.1-162 tidyselect_1.2.0 digest_0.6.31
## [49] stringi_1.7.12 reshape2_1.4.4 labeling_0.4.2
## [52] splines_4.3.0 ade4_1.7-22 fastmap_1.1.1
## [55] grid_4.3.0 colorspace_2.1-0 cli_3.6.1
## [58] magrittr_2.0.3 survival_3.5-5 utf8_1.2.3
## [61] ape_5.7-1 withr_2.5.0 scales_1.2.1
## [64] rmarkdown_2.21 XVector_0.40.0 multtest_2.56.0
## [67] igraph_1.4.3 ragg_1.2.5 evaluate_0.21
## [70] knitr_1.42 IRanges_2.34.0 mgcv_1.8-42
## [73] rlang_1.1.1 Rcpp_1.0.10 glue_1.6.2
## [76] BiocGenerics_0.46.0 rstudioapi_0.14 jsonlite_1.8.4
## [79] R6_2.5.1 Rhdf5lib_1.22.0 plyr_1.8.8
## [82] systemfonts_1.0.4 zlibbioc_1.46.0